Author: Christian O. Rosado
A temporal analysis, via time series clustering, of deaths over time in 122 U.S. cities.
# importing all dependencies
%matplotlib inline
import numpy as np
import pandas as pd
from pandas import DataFrame
import matplotlib.pylab as plt
import seaborn as sns
# Link:
#https://catalog.data.gov/dataset/deaths-in-122-u-s-cities-1962-2016-122-cities-mortality-reporting-system
!curl -O "https://data.cdc.gov/api/views/mr8w-325u/rows.csv?accessType=DOWNLOAD"
# Read the "122 Cities Mortality Reporting System" CSV from the working
# directory and wrap it in a DataFrame.
raw_data = pd.read_csv(
    'Deaths_in_122_U.S._cities_-_1962-2016._122_Cities_Mortality_Reporting_System.csv'
)
deaths_df = DataFrame(raw_data)
deaths_df.head()

# One row per Year, holding the column sums over all rows of that year.
deaths_by_year = deaths_df.groupby('Year').sum()
deaths_by_year.head()
# Population for top 4 US cities
'''
Rank City; State 2010 population
1 New York City; New York 8,175,133
2 Los Angeles; California 3,792,621
3 Chicago; Illinois 2,695,598
4 Houston; Texas 2,099,451
'''
# Column sums for every (City, Year) pair; the result is indexed by
# (City, Year).
deaths_by_city_and_year = deaths_df.groupby(['City', 'Year']).sum()

# City-years whose summed 'All Deaths' is above 90,000.
x = deaths_by_city_and_year.loc[deaths_by_city_and_year['All Deaths'] > 90000]
x

# Just the 'All Deaths' totals, still indexed by (City, Year).
deaths_per_city = deaths_by_city_and_year['All Deaths']
deaths_per_city.head()
# Distinct city names: the first element of every (City, Year) index tuple.
allcities = set(city_year[0] for city_year in deaths_by_city_and_year.index)

# One line per city: yearly 'All Deaths' on a linear y axis.
plt.figure(figsize=(10, 6))
for c in allcities:
    per_year = deaths_by_city_and_year.loc[c]
    plt.plot(per_year.index, per_year['All Deaths'], label=c)

plt.title('Deaths Over Time for All Cities', fontsize=18)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Deaths', fontsize=16)
# NOTE(review): line colors are assigned in iteration order of the `allcities`
# set, so which city ends up "gold" is not fixed by this code - confirm the
# caption still matches the rendered figure.
plt.text(1990, 80000, 'Gold Line -> NYC Death Counts', fontsize=13)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Legend is anchored outside the axes, five entries per row.
plt.legend(bbox_to_anchor=(1.05, -.07), loc=3, ncol=5)
# Same per-city 'All Deaths' series as a semi-log plot (log-scaled y axis).
plt.figure(figsize=(10, 6))
for c in allcities:
    per_year = deaths_by_city_and_year.loc[c]
    plt.semilogy(per_year.index, per_year['All Deaths'], label=c)

plt.title('Deaths Over Time for All Cities \nSemi-log plot', fontsize=18)
plt.xlabel('Year', fontsize=16)
plt.ylabel('Deaths', fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Legend is anchored outside the axes, five entries per row.
plt.legend(bbox_to_anchor=(1.05, -.07), loc=3, ncol=5)
# Ideas for further analysis:
#   - death rate per city per age group
#   - time series analysis per city: detect spikes and drops
#   - check whether any city shows periodicity


def _plot_city_control_limits(column, subtitle, baseline_rows=19, n_sigma=3):
    """Draw one control chart per city for ``column``.

    For each city in ``allcities`` a new figure shows the city's yearly values
    of ``column`` plus two horizontal lines at mean +/- ``n_sigma`` standard
    deviations. Mean and standard deviation are computed from the first
    ``baseline_rows`` rows of that city's series (selected by position, not by
    year label).

    Args:
        column: column of ``deaths_by_city_and_year`` to chart.
        subtitle: text placed on the second line of every figure title.
        baseline_rows: number of leading rows used to estimate the limits.
        n_sigma: distance of each limit from the mean, in standard deviations.
    """
    for city in allcities:
        # Look the city's series up once instead of on every plotting call.
        series = deaths_by_city_and_year.loc[city][column]
        baseline = series.iloc[:baseline_rows]
        center = baseline.mean()
        spread = baseline.std()
        upper = [center + (spread * n_sigma)] * len(series)
        lower = [center - (spread * n_sigma)] * len(series)

        plt.figure(figsize=(15, 2))
        plt.plot(series.index, series, label=city)
        plt.plot(series.index, lower, label='*Lower Control Limit')
        plt.plot(series.index, upper, label='*Upper Control Limit')
        plt.xlabel('Year', fontsize=11)
        plt.ylabel('Deaths', fontsize=11)
        plt.title('Deaths Over Time: ' + city + '\n' + subtitle)
        plt.legend()


_plot_city_control_limits('All Deaths', 'All Deaths')
_plot_city_control_limits('25-44 years', 'Ages 25-44')
# Peek at one (City, Year) index tuple.
deaths_by_city_and_year.index[130]

# city_deaths[city][year] -> summed 'All Deaths' for that city and year.
# The original loop computed city, year and deaths for every row but never
# stored them, leaving the dict empty.
city_deaths = {}
_all_deaths = deaths_by_city_and_year['All Deaths']
# Pair each index tuple with the value at the same position, rather than
# indexing the label-indexed Series with a bare integer.
for (city, year), deaths in zip(_all_deaths.index, _all_deaths.values):
    city_deaths.setdefault(city, {})[year] = deaths
# Yearly totals (all cities combined) for each age bracket, one panel per
# bracket on a 3 x 2 grid.
columns = ['All Deaths', '<1 year (all cause deaths)', '1-24 years (all cause deaths)',
           '25-44 years', '45-64 years (all cause deaths)', '65+ years (all cause deaths)']

# Start from a fresh, explicitly sized figure instead of drawing on whichever
# figure happens to be current.
plt.figure(figsize=(20, 10))
# `i` and `j` keep their original names: a later scratch expression in this
# file reads `i` after the loop.
for i, j in enumerate(columns, 1):
    ax = plt.subplot(3, 2, i)
    # Select the column by name; the Year index supplies the x values.
    deaths_by_year.plot(y=j, ax=ax)
    plt.xlabel('Year', fontsize=18)
    plt.ylabel('Deaths', fontsize=18)
    plt.legend(fontsize=15)
    plt.title(j, fontsize=22)
# Lay the grid out once, after every panel has its labels and title.
plt.tight_layout()
There seems to be a spike in deaths in the 25-44 age group.
# Control chart for yearly deaths (all cities combined) in the 25-44 bracket.
# The limits are the baseline mean +/- 3 standard deviations, where the
# baseline is the first 19 rows of the yearly series.
baseline = deaths_by_year['25-44 years'].iloc[:19]
mean = baseline.mean()
sigma = baseline.std()
threashhold = [[mean+(sigma*3)]*len(deaths_by_year.index), [mean-(sigma*3)]*len(deaths_by_year.index)]

plt.figure(figsize=(20, 10))
plt.plot(deaths_by_year.index, deaths_by_year['25-44 years'], label='Deaths')
plt.plot(deaths_by_year.index, threashhold[1], label='*Lower Control Limit')
plt.plot(deaths_by_year.index, threashhold[0], label='*Upper Control Limit')
plt.xlabel('Year', fontsize=30)
plt.ylabel('Deaths', fontsize=30)
plt.legend(loc=0, fontsize=20)
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)
plt.title('Deaths per Year: 1962-2016 \n People 25-44 Years Old', fontsize=32)

gca = plt.gca()
gca.set_ylim([25000, 75000])
# The annotation text is passed positionally: the `s=` keyword was renamed in
# newer matplotlib releases, while the positional form works in old and new.
plt.annotate("Spike in Deaths", xy=(1987.5, 71000), fontsize=22)
# Light red band highlighting the spike.
gca.add_patch(plt.Rectangle((1985.4, 0), 12.2, 75600, alpha=.05, color="red"))

# Footer placed below the visible y range. The baseline years are read from
# the data so the caption always matches the rows used for the limits (the
# previous hard-coded "1962-1985" did not match a 19-row baseline).
plt.text(1960, 11000, 'Data source: Data.gov, "Deaths in 122 U.S. Cities" | '
         'Author: Christian Rosado \n *Control limits were calculated using '
         'death counts from {0}-{1}'.format(baseline.index[0], baseline.index[-1]),
         fontsize=20)
The plot above shows a significant spike in deaths during the 1980s and 1990s. After doing some online research, I learned that these two decades were plagued by high crime rates, drug use, and the HIV/AIDS epidemic. I also learned that President Bill Clinton signed the 1994 Crime Bill to combat these problems nationwide.
# Drugs
#http://www.gallup.com/poll/6352/decades-drug-use-80s-90s.aspx
# HIV/AIDS
#https://www.cdc.gov/mmwr/preview/mmwrhtml/mm5021a2.htm
#crime rates
#http://www.nationalreview.com/corner/427758/careful-panic-violent-crime-and-gun-crime-are-both-dropping-charles-c-w-cooke
#http://www.forbes.com/sites/neilhowe/2015/05/28/whats-behind-the-decline-in-crime/#87e72c077336
# All age brackets on shared axes: linear scale (top-left panel) and
# log-scaled y axis (bottom panel).
columns = ['All Deaths', '<1 year (all cause deaths)', '1-24 years (all cause deaths)',
           '25-44 years', '45-64 years (all cause deaths)', '65+ years (all cause deaths)']
plt.figure(figsize=(25, 20))

plt.subplot(2, 2, 1)
for j in columns:
    # Label each line explicitly so the legend does not depend on matplotlib
    # inferring a label from the Series name.
    plt.plot(deaths_by_year.index, deaths_by_year[j], label=j)
plt.xlabel('Year', fontsize=22)
plt.ylabel('Deaths', fontsize=22)
# Anchor the legend to the right of the panel, outside the axes.
plt.legend(loc='upper right', bbox_to_anchor=(1.65, 1), fontsize=20)
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)
plt.title('Deaths vs. Year', fontsize=32)

plt.subplot(2, 1, 2)
for j in columns:
    plt.semilogy(deaths_by_year.index, deaths_by_year[j], label=j)
plt.xlabel('Year', fontsize=22)
plt.ylabel('Deaths', fontsize=22)
plt.legend(loc=0, fontsize=20)
plt.xticks(fontsize=22)
plt.yticks(fontsize=22)
plt.title('Deaths vs. Year \n Semi Log Plot: y-axis', fontsize=32)
# Scratch checks; none of the results below is stored.
deaths_by_city_and_year.head()
# NOTE(review): `i` is whatever value an earlier loop left behind, so the row
# compared against 'Akron' depends on execution history.
deaths_by_city_and_year.index[i][0] == 'Akron'
# NOTE(review): `c` is likewise a leftover loop variable (the last city an
# earlier loop visited).
np.array(deaths_by_city_and_year.loc[c]['All Deaths'])-70000
def _normalized_city_series(column):
    """Return the min-max normalised yearly values of ``column`` per city.

    For every city in ``allcities`` the city's series of ``column`` (taken
    from ``deaths_by_city_and_year``) is rescaled with
    ``(value - min) / (max - min)`` using that city's own minimum and maximum.

    Args:
        column: column of ``deaths_by_city_and_year`` to normalise.

    Returns:
        A DataFrame with columns ``deaths`` (normalised value), ``city`` and
        ``year``: one row per (city, year), with a fresh 0..n-1 row index.
        A city whose series is constant produces NaN (zero range).
    """
    pieces = []
    for city in allcities:
        series = deaths_by_city_and_year.loc[city][column]
        # Force float so `/` is true division even for integer counts under
        # Python 2.
        counts = np.asarray(series, dtype=float)
        low = counts.min()
        high = counts.max()
        pieces.append(pd.DataFrame(
            {
                'deaths': (counts - low) / (high - low),
                'city': [city] * len(counts),
                'year': list(series.index),
            },
            columns=['deaths', 'city', 'year'],
        ))
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(pieces, ignore_index=True)


# Normalised series for the 25-44 age bracket.
df_age_group_subset = _normalized_city_series('25-44 years')
df_age_group_subset.describe()

# Normalised series for all deaths.
df_all_deaths = _normalized_city_series('All Deaths')
df_all_deaths.describe()
# TEST PLOT: normalised 25-44 values of one city, drawn against the frame's
# row index.
plt.figure(figsize=(15, 2))
schenectady_rows = df_age_group_subset.city == 'Schenectady'
plt.plot(df_age_group_subset.deaths[schenectady_rows])
Not all series have the same length due to nan values.
# A complete series has 55 values. List the cities whose 25-44 series has any
# other length.
for c in allcities:
    n_values = len(df_age_group_subset.deaths[df_age_group_subset.city == c])
    if n_values != 55:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(c)

# Print the series length for those same cities.
for c in allcities:
    n_values = len(df_age_group_subset.deaths[df_age_group_subset.city == c])
    if n_values != 55:
        print(n_values)
# Compare two cities by position in the series (0, 1, 2, ...) rather than by
# year.
plt.figure(figsize=(15, 2))
for city_name in ('Lexington', 'Rochester'):
    city_series = df_age_group_subset.deaths[df_age_group_subset.city == city_name]
    # Label each line with its own city; the original reused a leftover loop
    # variable, giving both lines the same unrelated label.
    plt.plot(range(len(city_series)), city_series, label=city_name)
# Normalised 25-44 series of every city, overlaid on one axis (x = year).
plt.figure(figsize=(15, 2))
for c in allcities:
    in_city = df_age_group_subset.city == c
    plt.plot(df_age_group_subset.year[in_city],
             df_age_group_subset.deaths[in_city],
             label=c)
# Normalised all-deaths series of every city, overlaid on one axis (x = year).
plt.figure(figsize=(15, 2))
for c in allcities:
    in_city = df_all_deaths.city == c
    plt.plot(df_all_deaths.year[in_city],
             df_all_deaths.deaths[in_city],
             label=c)
# Wide table for clustering: one row per year (1962-2016), one column per
# city, initially empty. An explicit list gives the columns a definite order.
df_all_deaths_cluster = pd.DataFrame(columns=list(allcities), index=range(1962, 2017))
df_all_deaths_cluster.T.head()

# Fill each (year, city) cell with that city's normalised all-deaths value.
for c in allcities:
    # Filter the long table once per city rather than once per cell.
    city_rows = df_all_deaths[df_all_deaths.city == c]
    for y, value in zip(city_rows.year, city_rows.deaths):
        # `.at` is the scalar setter; DataFrame.set_value was deprecated and
        # later removed from pandas.
        df_all_deaths_cluster.at[y, c] = value
df_all_deaths_cluster.T.head()
colors = ['#D91F1F', '#952540', '#502C61', '#0C3383', '#25957a', '#87CEEB', '#FF5721', 'w']
from sklearn.cluster import KMeans

# Keep only the cities (columns) that have a value for every year.
df_all_deaths_cluster.dropna(axis=1, inplace=True)

# Cluster the cities (rows of the transposed table) into 3 k-means groups.
km = KMeans(n_clusters=3, random_state=1162)
labels = km.fit_predict(df_all_deaths_cluster.T)
labels

# Normalised all-deaths series of every city, coloured by k-means label.
kmeans_line_colors = {0: 'b', 1: 'r', 2: 'y', 3: 'hotpink', 4: 'orange'}
plt.figure(figsize=(15, 5))
for city_name, cluster_id in zip(df_all_deaths_cluster.columns, labels):
    plt.plot(df_all_deaths_cluster.index,
             df_all_deaths_cluster[city_name],
             label=city_name,
             color=kmeans_line_colors.get(cluster_id, 'g'))  # any other label -> green
plt.title('Clustering Deaths Over Time in 122 U.S. Cities \nKMeans: 3 Clusters', fontsize=20)
plt.xlabel('Year', fontsize=20)
plt.ylabel('Death Count (normalized)', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
from sklearn.cluster import DBSCAN

# Density-based clustering of the same city series; `labels` is overwritten
# with the DBSCAN result.
db = DBSCAN(eps=.9, min_samples=10)
labels = db.fit_predict(df_all_deaths_cluster.T)
labels

# Normalised all-deaths series of every city, coloured by DBSCAN label.
# Labels outside 0-4 (including -1) are drawn in white.
dbscan_line_colors = {0: 'b', 1: 'r', 2: 'y', 3: 'pink', 4: 'orange'}
plt.figure(figsize=(20, 5))
for city_name, cluster_id in zip(df_all_deaths_cluster.columns, labels):
    plt.plot(df_all_deaths_cluster.index,
             df_all_deaths_cluster[city_name],
             label=city_name,
             color=dbscan_line_colors.get(cluster_id, 'w'))
plt.title('Clustering Deaths Over Time in 122 U.S. Cities \nDBSCAN(eps=.9, min_samples=10) \n3 Clusters',
          fontsize=20)
plt.xlabel('Year', fontsize=20)
plt.ylabel('Death Count (normalized)', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# One row per city with its yearly values as floats, plus the current
# `labels` array as a float 'label' column.
df_all_deaths_cluster_Labels = df_all_deaths_cluster.T.astype(float).copy()
df_all_deaths_cluster_Labels["label"] = labels.astype(float)

# Mean yearly value within each label.
df_all_deaths_cluster_Labels_Mean = df_all_deaths_cluster_Labels.groupby('label').mean()
df_all_deaths_cluster_Labels_Mean.head()

# One line per label: the mean normalised series of its member cities.
mean_line_colors = {0: 'b', 1: 'r', 2: 'y', 3: 'hotpink', 4: 'orange'}
plt.figure(figsize=(20, 5))
for cluster_id, yearly_mean in df_all_deaths_cluster_Labels_Mean.iterrows():
    plt.plot(df_all_deaths_cluster_Labels_Mean.columns,
             yearly_mean,
             label='cluster ' + str(cluster_id),
             color=mean_line_colors.get(cluster_id, 'g'))  # any other label -> green
plt.title('Clustering Deaths Over Time in 122 U.S. Cities \nDBSCAN(eps=.9, min_samples=10) \n3 Clusters (Means)',
          fontsize=20)
plt.xlabel('Year', fontsize=20)
plt.ylabel('Death Count (normalized)', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.legend(loc=0, fontsize=14)
# Wide table for clustering the 25-44 bracket: one row per year (1962-2016),
# one column per city, initially empty. An explicit list gives the columns a
# definite order.
df_age_group_subset_cluster = pd.DataFrame(columns=list(allcities), index=range(1962, 2017))
df_age_group_subset_cluster.T.head()

# Fill each (year, city) cell with that city's normalised 25-44 value.
for c in allcities:
    # Filter the long table once per city rather than once per cell.
    city_rows = df_age_group_subset[df_age_group_subset.city == c]
    for y, value in zip(city_rows.year, city_rows.deaths):
        # `.at` is the scalar setter; DataFrame.set_value was deprecated and
        # later removed from pandas.
        df_age_group_subset_cluster.at[y, c] = value
df_age_group_subset_cluster.T.head()
# Keep only the cities (columns) that have a value for every year.
df_age_group_subset_cluster.dropna(axis=1, inplace=True)

# Cluster the cities (rows of the transposed table) into 4 k-means groups.
km = KMeans(n_clusters=4, random_state=1162)
labels = km.fit_predict(df_age_group_subset_cluster.T)
labels

# Normalised 25-44 series of every city, coloured by k-means label.
kmeans_line_colors = {0: 'b', 1: 'r', 2: 'y', 3: 'hotpink', 4: 'orange'}
plt.figure(figsize=(15, 5))
# Iterating a DataFrame yields its column labels (the city names).
for city_name, cluster_id in zip(df_age_group_subset_cluster, labels):
    plt.plot(df_age_group_subset_cluster.index,
             df_age_group_subset_cluster[city_name],
             label=city_name,
             color=kmeans_line_colors.get(cluster_id, 'g'))  # any other label -> green
plt.title('Clustering Deaths Over Time in 122 U.S. Cities \nAges: 25-44 \nKMeans: 4 Clusters', fontsize=20)
plt.xlabel('Year', fontsize=20)
plt.ylabel('Death Count (normalized)', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# Density-based clustering of the 25-44 city series; `labels` is overwritten
# with the DBSCAN result.
db = DBSCAN(eps=.99, min_samples=2)
labels = db.fit_predict(df_age_group_subset_cluster.T)
labels

# Normalised 25-44 series of every city, coloured by DBSCAN label.
colors = ['#D91F1F', '#952540', '#502C61', '#0C3383', '#25957a', '#87CEEB', '#FF5721', 'w']
# Labels 0-4 take the first five palette entries; anything else (including
# -1) takes the last entry, white.
palette_by_label = dict(enumerate(colors[:5]))
plt.figure(figsize=(20, 5))
for city_name, cluster_id in zip(df_age_group_subset_cluster.columns, labels):
    plt.plot(df_age_group_subset_cluster.index,
             df_age_group_subset_cluster[city_name],
             label=city_name,
             color=palette_by_label.get(cluster_id, colors[7]))
plt.title('Clustering Deaths Over Time in 122 U.S. Cities \nAges: 25-44 \nDBSCAN(eps=.99, min_samples=2) \n4 Clusters',
          fontsize=20)
plt.xlabel('Year', fontsize=20)
plt.ylabel('Death Count (normalized)', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# One row per city with its yearly 25-44 values as floats, plus the current
# `labels` array as a float 'label' column.
df_age_group_subset_cluster_Labels = df_age_group_subset_cluster.T.astype(float).copy()
df_age_group_subset_cluster_Labels["label"] = labels.astype(float)

# Mean yearly value within each label.
df_age_group_subset_cluster_Labels_Mean = df_age_group_subset_cluster_Labels.groupby('label').mean()
df_age_group_subset_cluster_Labels_Mean.head()

# Cluster means: one line per label, the mean normalised 25-44 series of its
# member cities.
colors = ['#D91F1F', '#952540', '#502C61', '#0C3383', '#25957a', '#87CEEB', '#FF5721', 'w']
# Labels 0-4 take the first five palette entries; anything else takes the
# sixth.
palette_by_label = dict(enumerate(colors[:5]))
plt.figure(figsize=(20, 5))
for cluster_id, yearly_mean in df_age_group_subset_cluster_Labels_Mean.iterrows():
    plt.plot(df_age_group_subset_cluster_Labels_Mean.columns,
             yearly_mean,
             label='cluster ' + str(cluster_id),
             color=palette_by_label.get(cluster_id, colors[5]))
plt.title('Clustering Deaths Over Time in 122 U.S. Cities \nAges: 25-44 \nDBSCAN(eps=.99, min_samples=2) \n4 Clusters (Means)',
          fontsize=20)
plt.xlabel('Year', fontsize=20)
plt.ylabel('Death Count (normalized)', fontsize=20)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
plt.legend(loc=0, fontsize=14)
# City names (row labels) for each value of the 'label' column.
# high death count cities
df_age_group_subset_cluster_Labels[df_age_group_subset_cluster_Labels.label == 2].index
df_age_group_subset_cluster_Labels[df_age_group_subset_cluster_Labels.label == 1].index
df_age_group_subset_cluster_Labels[df_age_group_subset_cluster_Labels.label == -1].index
# low death count cities
df_age_group_subset_cluster_Labels[df_age_group_subset_cluster_Labels.label == 0].index